*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: 'data_counterfactual_analysis.dta'
* 			 		 
*	- Output: Figures 1, S1, and S2; Tables S2, S3, and S4; robustness_discontinuity.dta
*			 
* Topic: Discontinuity Tests
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

clear all
clear matrix
cap log close
set more off

*-----*
* 0.1 *
*-----*

* Run the dofiles which defines all the globals for the folder's and sub-folders
* paths to both data and analysis.

if c(username) == "chrdec" {													
	do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"   			// 0_globals.do path

}

*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
macro list
cd "${data}"
set matsize 11000

cap log off
log using "${run}${log}log_6.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*-------------------*
* Table of content: *
*-------------------*
* Sections: 
* 1. Load Data
* 2. Table S2 (p-values with free bandwidth)
* 3. Robustness: test on one-sided statistics (p-values with free bandwidth)
* 4. Figure 1
* 5. Figure S2 (one-sided statistics)
* 6. Table S3 (checking for spike)
* 7. Robustness




*--------------------------*
* Section 1 -- Load Data *
*--------------------------*

use "${data}${final}data_counterfactual_analysis.dta", clear

local threshold=-invnormal(.05/2)

gen z_1s = -invnormal(p)
local t_1s=-invnormal(.05)

keep if Ph2==1 | Ph3==1

preserve
gen SPLIT=top10rev

*------------------------------------------------------*
* Section 2 -- Table S2 (p-values with free bandwidth) *
*------------------------------------------------------*

putexcel set "${run}${table}tabS2.xls", replace

putexcel A1=(" ") B1=("Phase II") C1=("Phase III") 
rddensity z  if z_modifier==0 & Ph2==1,c(`threshold') 
putexcel A2=("All") B2=(round(e(pv_q),0.01)) B3=(e(N))
count if z_modifier==0 & Ph2==1
local aux=r(N)
putexcel B3=("(`aux')")
rddensity z  if z_modifier==0 & Ph3==1,c(`threshold') 
putexcel C2=(round(e(pv_q),0.01)) 
count if z_modifier==0 & Ph3==1
local aux=r(N)
putexcel C3=("(`aux')")

rddensity z  if z_modifier==0 & INDUSTRY==0 & Ph2==1,c(`threshold') 
putexcel A4=("Non-industry") B4=(round(e(pv_q),0.01)) 
count if z_modifier==0 & INDUSTRY==0 & Ph2==1
local aux=r(N)
putexcel B5=("(`aux')")
rddensity z  if z_modifier==0& INDUSTRY==0 & Ph3==1,c(`threshold')
putexcel C4=(round(e(pv_q),0.01))
count  if z_modifier==0& INDUSTRY==0 & Ph3==1
local aux=r(N)
putexcel C5=("(`aux')")

rddensity z  if z_modifier==0& INDUSTRY==1 & Ph2==1,c(`threshold') 
putexcel A6=("All industry") B6=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1 & Ph2==1
local aux=r(N)
putexcel B7=("(`aux')")
rddensity z  if z_modifier==0& INDUSTRY==1 & Ph3==1,c(`threshold')
putexcel C6=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1 & Ph3==1
local aux=r(N)
putexcel C7=("(`aux')")

rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1,c(`threshold') 
putexcel A8=("Small industry") B8=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1
local aux=r(N)
putexcel B9=("(`aux')")
rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`threshold') 
putexcel C8=(round(e(pv_q),0.001)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1
local aux=r(N)
putexcel C9=("(`aux')")

rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1,c(`threshold') 
putexcel A10=("Top 10 industry") B10=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1
local aux=r(N)
putexcel B11=("(`aux')")
rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`threshold')
putexcel C10=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1
local aux=r(N)
putexcel C11=("(`aux')")

*---------------------------------------------------------------------------------------*
* Section 3 -- Robustness: test on one-sided statistics (p-values with free bandwidth)  *
*---------------------------------------------------------------------------------------*

putexcel set "${run}${table}tabS4.xls", replace

putexcel A1=(" ") B1=("Phase II") C1=("Phase III") 
rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1,c(`t_1s') 
putexcel A2=("Small industry") B2=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1
local aux=r(N)
putexcel B3=("(`aux')")
rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`t_1s') 
putexcel C2=(round(e(pv_q),0.001)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1
local aux=r(N)
putexcel C3=("(`aux')")

rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1,c(`t_1s') 
putexcel A4=("Top 10 industry") B4=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1
local aux=r(N)
putexcel B5=("(`aux')")
rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`t_1s') 
putexcel C4=(round(e(pv_q),0.01)) 
count if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1
local aux=r(N)
putexcel C5=("(`aux')")


*-----------------------*
* Section 4 -- Figure 1 *
*-----------------------*


qui rddensity z  if z_modifier==0 & Ph2==1,c(`threshold') h(0.8) genvars(Phase2) plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw) 

qui rddensity z  if z_modifier==0 & Ph3==1,c(`threshold') h(0.8) genvars(Phase3)  plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z  if z_modifier==0& INDUSTRY==0 & Ph2==1,c(`threshold') h(0.6) genvars(Phase2NI)  plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z  if z_modifier==0& INDUSTRY==0 & Ph3==1,c(`threshold') h(1) genvars(Phase3NI)  plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1,c(`threshold') h(1.1) genvars(Phase2SI)  plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`threshold') h(0.8) genvars(Phase3SI)  plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1,c(`threshold') h(1.1) genvars(Phase2big)  plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`threshold') h(0.8) genvars(Phase3big)   plot plot_range(0 4.5) plot_n(100 100) graph_options(nodraw)

local outcomeList "Phase2 Phase3 Phase2NI Phase3NI Phase2SI Phase3SI Phase2big Phase3big"
foreach outcome of local outcomeList {
replace `outcome'_grid=. if `outcome'_group==1&`outcome'_group[_n-1]==0
replace `outcome'_grid=. if `outcome'_grid>4
replace `outcome'_grid=. if `outcome'_grid<0.05
replace `outcome'_cil=0 if `outcome'_cil<0
}



twoway (connected Phase2_f Phase2_grid, msymbol(none) lcolor(ebblue) lwidth(thick) lpattern(dash) cmissing(n)) (connected Phase3_f Phase3_grid, msymbol(none) lcolor(gs6) lwidth(thick) lpattern(solid) cmissing(n)) (rarea Phase2_cil Phase2_cir Phase2_grid, fcolor(ebblue%20) lwidth(none) cmissing(n)) (rarea Phase3_cil Phase3_cir Phase3_grid if Phase3_grid<10, fcolor(gs6%25) lwidth(none) cmissing(n)), ytitle(Density) xtitle(z) title("{bf:A}", pos(11) size(huge)) subtitle("All sponsors",pos(12) size(large)) ylabel( , format(%03.1f)) xline(1.96, lwidth(medthick) lpattern(dash) lcolor(black)) text(0.6 2.325 "z=1.96") text(0.325 2.08 "{bf:***}", color(gs6)) text(0.19 2.08 "{bf:*}", color(ebblue)) plotr(ls(none)) scheme(s1color) name(Cattaneo_All,replace) legend(off) nodraw
//
twoway (connected Phase2NI_f Phase2NI_grid, msymbol(none) lcolor(ebblue) lwidth(thick) lpattern(dash) cmissing(n)) (connected Phase3NI_f Phase3NI_grid, msymbol(none) lcolor(gs6) lwidth(thick) lpattern(solid) cmissing(n)) (rarea Phase2NI_cil Phase2NI_cir Phase2NI_grid, fcolor(ebblue%20) lwidth(none) cmissing(n)) (rarea Phase3NI_cil Phase3NI_cir Phase3NI_grid if Phase3NI_grid<10, fcolor(gs6%25) lwidth(none) cmissing(n)), ytitle(Density) xtitle(z) title("{bf:B}", pos(11) size(huge)) subtitle("Non-industry",pos(12) size(large)) ylabel( , format(%03.1f)) xline(1.96, lwidth(medthick) lpattern(dash) lcolor(black))  text(0.8 2.325 "z=1.96") plotr(ls(none)) scheme(s1color) name(Cattaneo_NonIndustry,replace) legend(off) nodraw
//
twoway (connected Phase2SI_f Phase2SI_grid, msymbol(none) lcolor(ebblue) lwidth(thick) lpattern(dash) cmissing(n)) (connected Phase3SI_f Phase3SI_grid, msymbol(none) lcolor(gs6) lwidth(thick) lpattern(solid) cmissing(n)) (rarea Phase2SI_cil Phase2SI_cir Phase2SI_grid, fcolor(ebblue%20) lwidth(none) cmissing(n)) (rarea Phase3SI_cil Phase3SI_cir Phase3SI_grid, fcolor(gs6%25) lwidth(none) cmissing(n)), ytitle(Density) xtitle(z) title("{bf:D}", pos(11) size(huge)) subtitle("Small industry",pos(12) size(large)) ylabel( , format(%03.1f)) xline(1.96, lwidth(medthick) lpattern(dash) lcolor(black)) legend(order(1 "Phase II" 2 "Phase III"))  text(0.6 2.325 "z=1.96") text(0.35 2.08 "{bf:**}", color(gs6)) plotr(ls(none)) scheme(s1color) name(Cattaneo_smallIndustry,replace) nodraw
// 
twoway (connected Phase2big_f Phase2big_grid, msymbol(none) lcolor(ebblue) lwidth(thick) lpattern(dash) cmissing(n)) (connected Phase3big_f Phase3big_grid, msymbol(none) lcolor(gs6) lwidth(thick) lpattern(solid) cmissing(n)) (rarea Phase2big_cil Phase2big_cir Phase2big_grid, fcolor(ebblue%20) lwidth(none) cmissing(n)) (rarea Phase3big_cil Phase3big_cir Phase3big_grid, fcolor(gs6%25) lwidth(none) cmissing(n)), ytitle(Density) xtitle(z) title("{bf:C}", pos(11) size(huge)) subtitle("Top 10 industry",pos(12) size(large)) ylabel( , format(%03.1f)) xline(1.96, lwidth(medthick) lpattern(dash) lcolor(black)) text(0.8 2.325 "z=1.96") plotr(ls(none)) scheme(s1color) name(Cattaneo_top10,replace) legend(off) nodraw
//

grc1leg Cattaneo_All Cattaneo_NonIndustry Cattaneo_top10 Cattaneo_smallIndustry, imargin(0 0 0 0) scheme(s1color) legendfrom(Cattaneo_smallIndustry) name(fig1, replace) ysize(16) xsize(20)
graph export "${run}${graph}fig1.png", replace width(1500)



*-----------------------------------------------*
* Section 5 -- Figure S2 (one-sided statistics) *
*-----------------------------------------------*

qui rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1,c(`t_1s') h(1.8) genvars(Phase2SI1s)  plot plot_range(-4 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`t_1s') h(1.8) genvars(Phase3SI1s)  plot plot_range(-4 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1,c(`t_1s') h(1.18 1.8) genvars(Phase2big1s)  plot plot_range(-4 4.5) plot_n(100 100) graph_options(nodraw)

qui rddensity z_1s  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`t_1s') h(2.6 2) genvars(Phase3big1s)   plot plot_range(-4 4.5) plot_n(100 100) graph_options(nodraw)



local outcomeList "Phase2SI1s Phase3SI1s Phase2big1s Phase3big1s"
foreach outcome of local outcomeList {
replace `outcome'_grid=. if `outcome'_group==1&`outcome'_group[_n-1]==0
replace `outcome'_grid=. if `outcome'_grid>4
replace `outcome'_grid=. if `outcome'_grid<-3.5
replace `outcome'_cil=0 if `outcome'_cil<0
replace `outcome'_f=. if `outcome'_f<0
replace `outcome'_cil=. if `outcome'_f==.
replace `outcome'_cir=. if `outcome'_f==.
replace `outcome'_cir=0.5 if `outcome'_cir>0.5
}


twoway (connected Phase2SI1s_f Phase2SI1s_grid, msymbol(none) lcolor(ebblue) lwidth(thick) lpattern(dash) cmissing(n)) (connected Phase3SI1s_f Phase3SI1s_grid, msymbol(none) lcolor(gs6) lwidth(thick) lpattern(solid) cmissing(n)) (rarea Phase2SI1s_cil Phase2SI1s_cir Phase2SI1s_grid, fcolor(ebblue%20) lwidth(none) cmissing(n)) (rarea Phase3SI1s_cil Phase3SI1s_cir Phase3SI1s_grid, fcolor(gs6%25) lwidth(none) cmissing(n)), ytitle(Density) xtitle("z{subscript:1-sided}") title("{bf:B}", pos(11) size(huge)) subtitle("Small industry",pos(12) size(large)) ylabel( , format(%03.1f)) xline(1.64, lwidth(medthick) lpattern(dash) lcolor(black)) legend(order(1 "Phase II" 2 "Phase III"))  text(0.45 2.8 "z{subscript:1-sided}=1.64") text(0.325 2.08 "{bf:*}", color(gs6)) plotr(ls(none)) scheme(s1color) name(Cattaneo_smallIndustry1s,replace) nodraw
// 
twoway (connected Phase2big1s_f Phase2big1s_grid, msymbol(none) lcolor(ebblue) lwidth(thick) lpattern(dash) cmissing(n)) (connected Phase3big1s_f Phase3big1s_grid, msymbol(none) lcolor(gs6) lwidth(thick) lpattern(solid) cmissing(n)) (rarea Phase2big1s_cil Phase2big1s_cir Phase2big1s_grid, fcolor(ebblue%20) lwidth(none) cmissing(n)) (rarea Phase3big1s_cil Phase3big1s_cir Phase3big1s_grid, fcolor(gs6%25) lwidth(none) cmissing(n)), ytitle(Density) xtitle("z{subscript:1-sided}") title("{bf:A}", pos(11) size(huge)) subtitle("Top 10 industry",pos(12) size(large)) ylabel( , format(%03.1f)) xline(1.64, lwidth(medthick) lpattern(dash) lcolor(black)) text(0.45 2.8 "z{subscript:1-sided}=1.64") plotr(ls(none)) scheme(s1color) name(Cattaneo_top101s,replace) legend(off) nodraw
// 

grc1leg Cattaneo_top101s Cattaneo_smallIndustry1s, imargin(0 0 0 0) scheme(s1color) legendfrom(Cattaneo_smallIndustry1s) name(fig1_1s, replace) ycom ysize(2) xsize(10)
graph export "${run}${graph}figS2.png", replace width(1500)
  

*--------------------------------------------*
* Section 6 -- Table S3 (checking for spike) *
*--------------------------------------------*

local t1=-invnormal(.05/2)
local t2=`t1'+.05
local t3=`t1'+.5 


*-----*
* 6.1 *
*-----*

* Test Statistics

putexcel set "${run}${table}tabS3.xls" , sheet("test statistics") replace
putexcel A1=(" ") A2=("Phase III") A3=("All industry") A4=("Small industry") A5=("Top 10 industry") 

forvalues j = 1(1)3 {
if `j'==1 {
  local col B
}
if `j'==2 {
  local col C
}
if `j'==3 {
  local col D
}

local threshold `t`j''

putexcel `col'1=(round(`threshold',0.01))

rddensity z  if z_modifier==0& INDUSTRY==1 & Ph3==1,c(`threshold')
putexcel `col'3=(round(e(f_qr)-e(f_ql),0.001)) 
rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`threshold') 
putexcel `col'4=(round(e(f_qr)-e(f_ql),0.001)) 
rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`threshold') 
putexcel `col'5=(round(e(f_qr)-e(f_ql),0.001)) 


}


*-----*
* 6.2 *
*-----*

* P-Values

putexcel set "${run}${table}tabS3.xls" , sheet("p values") modify
putexcel A1=(" ") A2=("Phase III") A3=("All industry") A4=("Small industry") A5=("Top 10 industry") 

forvalues j = 1(1)3 {
if `j'==1 {
  local col B
}
if `j'==2 {
  local col C
}
if `j'==3 {
  local col D
}

local threshold `t`j''

putexcel `col'1=(round(`threshold',0.01))

rddensity z  if z_modifier==0& INDUSTRY==1 & Ph3==1,c(`threshold')
putexcel `col'3=(round(e(pv_q),0.001)) 
rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`threshold') 
putexcel `col'4=(round(e(pv_q),0.001)) 
rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`threshold') 
putexcel `col'5=(round(e(pv_q),0.001)) 
}

restore

*-------------------------*
* Section 7 -- Robustness *
*-------------------------*

local threshold=-invnormal(.05/2)
local criteria rev_rank rxsales_rank rdspend_rank rep_rank

tempname memhold
postfile `memhold' str20 criterion rank Ph2_small Ph2_large Ph3_small Ph3_large using "${data}${final}robustness_discontinuity.dta", replace

foreach criterion of local criteria {
forval c = 7 / 20 {

disp in red "`criterion'  `c'"
quietly {
preserve
gen SPLIT=(`criterion'<=`c')
sleep 500

rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph2==1,c(`threshold') 
local s2 = e(pv_q)

rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==0 & Ph3==1,c(`threshold') 
local s3 = e(pv_q)

rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph2==1,c(`threshold') 
local l2 = e(pv_q)

rddensity z  if z_modifier==0& INDUSTRY==1  & SPLIT==1 & Ph3==1,c(`threshold')
local l3 = e(pv_q)

post `memhold' ("`criterion'") (`c') (`s2') (`l2') (`s3') (`l3')

restore
}
}
}

postclose `memhold'

use "${data}${final}robustness_discontinuity.dta", clear

graph twoway (histogram Ph2_large, frac width(.005) start(0) fcol(navy) lcol(navy)) , ytitle(Fraction) xtitle(p) xline(.05, lwidth(medthick) lpattern(dash) lcolor(black)) title("{bf:A}", pos(11) size(huge)) subtitle("Phase II large industry",pos(12) size(large)) xlabel( , format(%03.1f)) ylabel( , format(%03.2f)) name(A,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none)) nodraw
graph twoway (histogram Ph3_large, frac width(.005) start(0) fcol(navy) lcol(navy)) , ytitle(Fraction) xtitle(p) xline(.05, lwidth(medthick) lpattern(dash) lcolor(black)) title("{bf:B}", pos(11) size(huge)) subtitle("Phase III large industry",pos(12) size(large)) xlabel( , format(%03.1f)) ylabel( , format(%03.2f)) name(B,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none)) nodraw
graph twoway (histogram Ph2_small, frac width(.005) start(0) fcol(navy) lcol(navy)) , ytitle(Fraction) xtitle(p) xline(.05, lwidth(medthick) lpattern(dash) lcolor(black)) title("{bf:C}", pos(11) size(huge)) subtitle("Phase II small industry",pos(12) size(large)) xlabel( , format(%03.1f)) ylabel( , format(%03.2f)) name(C,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none)) nodraw
graph twoway (histogram Ph3_small, frac width(.005) start(0) fcol(navy) lcol(navy)) , ytitle(Fraction) xtitle(p) xline(.05, lwidth(medthick) lpattern(dash) lcolor(black)) title("{bf:D}", pos(11) size(huge)) subtitle("Phase III small industry",pos(12) size(large)) xlabel( , format(%03.1f)) ylabel( , format(%03.2f)) name(D,replace) graphregion(color(white)) bgcolor(white) plotr(ls(none)) nodraw

graph combine A B C D, c(2) name(robustness_discontinuity,replace) ycom xcom
graph export "${run}${graph}figS1.png", replace width(1500)

log close 
cd "${run}${dofiles}"
